Your Title Here¶

Name(s): Alyvia Vaughan and Katie Hannigan

Website Link: (your website link)

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from dsc80_utils import * # Feel free to uncomment and use this.

Step 1: Introduction¶

In [3]:
# TODO
interactions_df = pd.read_csv('interactions.csv')
raw_recipes_df = pd.read_csv('RAW_recipes.csv')
In [4]:
interactions_df[interactions_df['rating'] == 0]['review']
Out[4]:
3         Just an observation, so I will not rate.  I fo...
5         Made my own buttermilk w/ vinegar and milk.  U...
10        This is a very good recipe.  We also want to c...
                                ...                        
731888    Delicious ! I tweeked the recipe a bit>>substi...
731893    Just added this mix to a homemade beef & s...
731895    Would this make a good hamburger patty seasoning?
Name: review, Length: 51832, dtype: object
In [5]:
raw_recipes_df
Out[5]:
name id minutes contributor_id ... steps description ingredients n_ingredients
0 1 brownies in the world best ever 333281 40 985201 ... ['heat the oven to 350f and arrange the rack i... these are the most; chocolatey, moist, rich, d... ['bittersweet chocolate', 'unsalted butter', '... 9
1 1 in canada chocolate chip cookies 453467 45 1848091 ... ['pre-heat oven the 350 degrees f', 'in a mixi... this is the recipe that we use at my school ca... ['white sugar', 'brown sugar', 'salt', 'margar... 11
2 412 broccoli casserole 306168 40 50969 ... ['preheat oven to 350 degrees', 'spray a 2 qua... since there are already 411 recipes for brocco... ['frozen broccoli cuts', 'cream of chicken sou... 9
... ... ... ... ... ... ... ... ... ...
83779 zydeco ya ya deviled eggs 308080 40 37779 ... ['in a bowl , combine the mashed yolks and may... deviled eggs, cajun-style ['hard-cooked eggs', 'mayonnaise', 'dijon must... 8
83780 cookies by design cookies on a stick 298512 29 506822 ... ['place melted butter in a large mixing bowl a... i've heard of the 'cookies by design' company,... ['butter', 'eagle brand condensed milk', 'ligh... 10
83781 cookies by design sugar shortbread cookies 298509 20 506822 ... ['whip sugar and shortening in a large bowl , ... i've heard of the 'cookies by design' company,... ['granulated sugar', 'shortening', 'eggs', 'fl... 7

83782 rows × 12 columns

In [6]:
recipes_full_df = pd.merge(raw_recipes_df, interactions_df, how='left', left_on='id', right_on='recipe_id')
recipes_full_df['rating'] = recipes_full_df['rating'].replace(0, np.nan)

average_rating_per_recipe = recipes_full_df.groupby('recipe_id')['rating'].mean()
average_rating_per_recipe_df = average_rating_per_recipe.reset_index(name='avg_rating')
recipes_df = recipes_full_df.merge(average_rating_per_recipe_df, on='recipe_id', how='left')

recipes_df
Out[6]:
name id minutes contributor_id ... date rating review avg_rating
0 1 brownies in the world best ever 333281 40 985201 ... 2008-11-19 4.0 These were pretty good, but took forever to ba... 4.0
1 1 in canada chocolate chip cookies 453467 45 1848091 ... 2012-01-26 5.0 Originally I was gonna cut the recipe in half ... 5.0
2 412 broccoli casserole 306168 40 50969 ... 2008-12-31 5.0 This was one of the best broccoli casseroles t... 5.0
... ... ... ... ... ... ... ... ... ...
234426 cookies by design sugar shortbread cookies 298509 20 506822 ... 2008-06-19 1.0 This recipe tastes nothing like the Cookies by... 3.0
234427 cookies by design sugar shortbread cookies 298509 20 506822 ... 2010-02-08 5.0 yummy cookies, i love this recipe me and my sm... 3.0
234428 cookies by design sugar shortbread cookies 298509 20 506822 ... 2014-11-01 NaN I work at a Cookies By Design and can say this... 3.0

234429 rows × 18 columns

In [7]:
recipes_df['date']
Out[7]:
0         2008-11-19
1         2012-01-26
2         2008-12-31
             ...    
234426    2008-06-19
234427    2010-02-08
234428    2014-11-01
Name: date, Length: 234429, dtype: object

Step 2: Data Cleaning and Exploratory Data Analysis¶

In [8]:
tester_nutrition_line = raw_recipes_df['nutrition'][0]
tester_nutrition_line.strip('[').strip(']').split(',')
Out[8]:
['138.4', ' 10.0', ' 50.0', ' 3.0', ' 3.0', ' 19.0', ' 6.0']
In [9]:
# TODO
# separating nutrition column into multiple columns
nutrition_split = recipes_df['nutrition'].str.strip('[').str.strip(']').str.split(',', expand=True)
nutrition_split.columns = ['calories (#)', 'total fat (PDV)', 'sugar (PDV)', 'sodium (PDV)', 'protein (PDV)', 'saturated fat (PDV)', 'carbohydrates (PDV)']
nutrition_split = nutrition_split.apply(pd.to_numeric, errors='coerce')
recipes_df = pd.concat([recipes_df, nutrition_split], axis=1).drop(columns=['nutrition'])
recipes_df
Out[9]:
name id minutes contributor_id ... sodium (PDV) protein (PDV) saturated fat (PDV) carbohydrates (PDV)
0 1 brownies in the world best ever 333281 40 985201 ... 3.0 3.0 19.0 6.0
1 1 in canada chocolate chip cookies 453467 45 1848091 ... 22.0 13.0 51.0 26.0
2 412 broccoli casserole 306168 40 50969 ... 32.0 22.0 36.0 3.0
... ... ... ... ... ... ... ... ... ...
234426 cookies by design sugar shortbread cookies 298509 20 506822 ... 4.0 4.0 11.0 6.0
234427 cookies by design sugar shortbread cookies 298509 20 506822 ... 4.0 4.0 11.0 6.0
234428 cookies by design sugar shortbread cookies 298509 20 506822 ... 4.0 4.0 11.0 6.0

234429 rows × 24 columns

In [10]:
#ingredients, steps, and tags columns are strings that look like lists, changing them to be lists

recipes_cleaned_df = recipes_df.assign(
    steps=recipes_df['steps'].str.strip('[]').str.replace("'", "").str.split(', '),
    ingredients=recipes_df['ingredients'].str.strip('[]').str.replace("'", "").str.split(', '),
    tags=recipes_df['tags'].str.strip('[]').str.replace("'", "").str.split(', ')
)

recipes_cleaned_df
Out[10]:
name id minutes contributor_id ... sodium (PDV) protein (PDV) saturated fat (PDV) carbohydrates (PDV)
0 1 brownies in the world best ever 333281 40 985201 ... 3.0 3.0 19.0 6.0
1 1 in canada chocolate chip cookies 453467 45 1848091 ... 22.0 13.0 51.0 26.0
2 412 broccoli casserole 306168 40 50969 ... 32.0 22.0 36.0 3.0
... ... ... ... ... ... ... ... ... ...
234426 cookies by design sugar shortbread cookies 298509 20 506822 ... 4.0 4.0 11.0 6.0
234427 cookies by design sugar shortbread cookies 298509 20 506822 ... 4.0 4.0 11.0 6.0
234428 cookies by design sugar shortbread cookies 298509 20 506822 ... 4.0 4.0 11.0 6.0

234429 rows × 24 columns

In [11]:
# change date and submitted column to datetime
recipes_cleaned_df['date_review_submitted'] = pd.to_datetime(recipes_cleaned_df['submitted'])
recipes_cleaned_df['date_recipe_posted'] = pd.to_datetime(recipes_cleaned_df['date'])

recipes_cleaned_df = recipes_cleaned_df.drop(columns=['date', 'submitted'])
recipes_cleaned_df
Out[11]:
name id minutes contributor_id ... saturated fat (PDV) carbohydrates (PDV) date_review_submitted date_recipe_posted
0 1 brownies in the world best ever 333281 40 985201 ... 19.0 6.0 2008-10-27 2008-11-19
1 1 in canada chocolate chip cookies 453467 45 1848091 ... 51.0 26.0 2011-04-11 2012-01-26
2 412 broccoli casserole 306168 40 50969 ... 36.0 3.0 2008-05-30 2008-12-31
... ... ... ... ... ... ... ... ... ...
234426 cookies by design sugar shortbread cookies 298509 20 506822 ... 11.0 6.0 2008-04-15 2008-06-19
234427 cookies by design sugar shortbread cookies 298509 20 506822 ... 11.0 6.0 2008-04-15 2010-02-08
234428 cookies by design sugar shortbread cookies 298509 20 506822 ... 11.0 6.0 2008-04-15 2014-11-01

234429 rows × 24 columns

In [12]:
# adding columns for nutrition that is represented in PDV to be in grams
# Pre-2016 mask
mask_pre = recipes_cleaned_df['date_recipe_posted'].dt.year < 2016

recipes_cleaned_df.loc[mask_pre, 'total fat (g)'] = recipes_cleaned_df.loc[mask_pre, 'total fat (PDV)'] * 65 / 100
recipes_cleaned_df.loc[mask_pre, 'sugar (g)'] = recipes_cleaned_df.loc[mask_pre, 'sugar (PDV)'] * 50 / 100
recipes_cleaned_df.loc[mask_pre, 'sodium (mg)'] = recipes_cleaned_df.loc[mask_pre, 'sodium (PDV)'] * 2400 / 100
recipes_cleaned_df.loc[mask_pre, 'protein (g)'] = recipes_cleaned_df.loc[mask_pre, 'protein (PDV)'] * 50 / 100
recipes_cleaned_df.loc[mask_pre, 'saturated fat (g)'] = recipes_cleaned_df.loc[mask_pre, 'saturated fat (PDV)'] * 20 / 100
recipes_cleaned_df.loc[mask_pre, 'carbohydrates (g)'] = recipes_cleaned_df.loc[mask_pre, 'carbohydrates (PDV)'] * 300 / 100

# 2016+ mask
mask_post = recipes_cleaned_df['date_recipe_posted'].dt.year >= 2016

recipes_cleaned_df.loc[mask_post, 'total fat (g)'] = recipes_cleaned_df.loc[mask_post, 'total fat (PDV)'] * 78 / 100
recipes_cleaned_df.loc[mask_post, 'sugar (g)'] = recipes_cleaned_df.loc[mask_post, 'sugar (PDV)'] * 50 / 100
recipes_cleaned_df.loc[mask_post, 'sodium (mg)'] = recipes_cleaned_df.loc[mask_post, 'sodium (PDV)'] * 2300 / 100
recipes_cleaned_df.loc[mask_post, 'protein (g)'] = recipes_cleaned_df.loc[mask_post, 'protein (PDV)'] * 50 / 100
recipes_cleaned_df.loc[mask_post, 'saturated fat (g)']  = recipes_cleaned_df.loc[mask_post, 'saturated fat (PDV)'] * 20 / 100
recipes_cleaned_df.loc[mask_post, 'carbohydrates (g)']  = recipes_cleaned_df.loc[mask_post, 'carbohydrates (PDV)'] * 275 / 100
In [13]:
recipes_cleaned_df
Out[13]:
name id minutes contributor_id ... sodium (mg) protein (g) saturated fat (g) carbohydrates (g)
0 1 brownies in the world best ever 333281 40 985201 ... 72.0 1.5 3.8 18.0
1 1 in canada chocolate chip cookies 453467 45 1848091 ... 528.0 6.5 10.2 78.0
2 412 broccoli casserole 306168 40 50969 ... 768.0 11.0 7.2 9.0
... ... ... ... ... ... ... ... ... ...
234426 cookies by design sugar shortbread cookies 298509 20 506822 ... 96.0 2.0 2.2 18.0
234427 cookies by design sugar shortbread cookies 298509 20 506822 ... 96.0 2.0 2.2 18.0
234428 cookies by design sugar shortbread cookies 298509 20 506822 ... 96.0 2.0 2.2 18.0

234429 rows × 30 columns

In [14]:
fig = px.histogram(recipes_cleaned_df, x='rating', nbins=50, title='Distribution of Ratings')
fig.show() 
In [15]:
recipes_cleaned_df['year'] = recipes_cleaned_df['date_review_submitted'].dt.year
grouped = recipes_cleaned_df.groupby('year')['avg_rating'].mean().reset_index()

fig = px.line(grouped, x='year', y='avg_rating', title='Average Rating Over Time')
fig.show()
In [16]:
def clean_tags(val):
    if isinstance(val, list):
        return val
    elif isinstance(val, str):
        return val.strip("[]").replace("'", "").split(", ")
    else:
        return []

recipes_cleaned_df['tags'] = recipes_cleaned_df['tags'].apply(clean_tags)
# baked_keywords = ['baking', 'baked', 'cake', 'cookie', 'bread', 'muffin',
#                   'brownie', 'biscuit', 'pie', 'pastry', 'cupcake']

baked_keywords = [
    'baking',
    'bread-machine',
    'bread',
    'breads',
    'quick-breads',
    'cakes',
    'cake-fillings-and-frostings',
    'cheesecake',
    'cupcakes',
    'cookies-and-brownies',
    'drop-cookies',
    'rolled-cookies',
    'bar-cookies',
    'brownies',
    'biscotti',
    'muffins',
    'pies',
    'pies-and-tarts',
    'tart',
    'tarts',
    'scones',
    'quiche',
    'bread-pudding',
    'puddings-and-mousses',
    'yeast',
    'flat-shapes',
    'crusts-pastry-dough-2',
    'fillings-and-frostings-chocolate',
    'dessert',
    'desserts',
    'desserts-easy',
    'desserts-fruit',
    'halloween-cakes',
    'halloween-cupcakes',
    'baked',
    'cake',
    'cookie',
    'cookies'
    'bread',
    'muffin',
    'brownie',
    'brownies',
    'biscuit',
    'pie',
    'pastry',
    'cupcake',
    'cupcakes'
]


# recipes_cleaned_df['is_baked_good'] = recipes_cleaned_df['tags'].apply(
#     lambda tags: any(isinstance(tag, str) and tag.lower() in baked_keywords for tag in tags)
# )

def is_baked(tags, name):
    tag_match = any(tag.lower() in baked_keywords for tag in tags if isinstance(tag, str))
    name_match = any(keyword in name.lower() for keyword in baked_keywords if isinstance(name, str))
    return tag_match or name_match

recipes_cleaned_df['is_baked_good'] = recipes_cleaned_df.apply(
    lambda row: is_baked(row['tags'], row['name']),
    axis=1
)
recipes_cleaned_df
Out[16]:
name id minutes contributor_id ... saturated fat (g) carbohydrates (g) year is_baked_good
0 1 brownies in the world best ever 333281 40 985201 ... 3.8 18.0 2008 True
1 1 in canada chocolate chip cookies 453467 45 1848091 ... 10.2 78.0 2011 True
2 412 broccoli casserole 306168 40 50969 ... 7.2 9.0 2008 False
... ... ... ... ... ... ... ... ... ...
234426 cookies by design sugar shortbread cookies 298509 20 506822 ... 2.2 18.0 2008 True
234427 cookies by design sugar shortbread cookies 298509 20 506822 ... 2.2 18.0 2008 True
234428 cookies by design sugar shortbread cookies 298509 20 506822 ... 2.2 18.0 2008 True

234429 rows × 32 columns

In [17]:
#Filter out outliers for clarity
filtered_df = recipes_cleaned_df[
    (recipes_cleaned_df['sugar (g)'] < 300) &
    (recipes_cleaned_df['protein (g)'] < 100) &
    (recipes_cleaned_df['saturated fat (g)'] < 100) &
    (recipes_cleaned_df['carbohydrates (g)'] < 300) &
    (recipes_cleaned_df['sodium (mg)'] < 4000) &
    (recipes_cleaned_df['calories (#)'] < 2000)
]

sample_df = filtered_df.sample(500, random_state=1)

# Univariate Analysis
fig1 = px.histogram(filtered_df, x='sugar (g)', nbins=50, title='Distribution of Sugar')
fig1.show()

fig2 = px.histogram(filtered_df, x='calories (#)', nbins=50, title='Distribution of Calories')
fig2.show()

# Bivariate Analysis
fig3 = px.box(filtered_df, x='is_baked_good', y='sugar (g)',
              title='Sugar Content by Baked Good Label')
fig3.show()

fig4 = px.box(filtered_df, x='is_baked_good', y='protein (g)',
              title='Protein Content by Baked Good Label')
fig4.show()

# Interesting Aggregates Table
agg_table = filtered_df.groupby('is_baked_good')[['sugar (g)', 'protein (g)', 'calories (#)']].mean().round(2)
print(agg_table)
               sugar (g)  protein (g)  calories (#)
is_baked_good                                      
False              16.90        17.68        360.94
True               38.31         8.76        346.79
In [18]:
# Average character length of description, grouped by is_baked_good
avg_lengths = filtered_df.groupby('is_baked_good')['description'].apply(lambda x: x.str.len().mean())

print("📏 Average description lengths:")
print(avg_lengths)
📏 Average description lengths:
is_baked_good
False    239.60
True     264.18
Name: description, dtype: float64

Step 3: Assessment of Missingness¶

In [19]:
# TODO
missing_counts = recipes_cleaned_df.isna().sum()

missing_df = pd.DataFrame({
    'Missing Count': missing_counts,
})

# Filter to only show columns with missing values
missing_df = missing_df[missing_df['Missing Count'] > 0]

# Sort by most missing values
missing_df = missing_df.sort_values(by='Missing Count', ascending=False)

missing_df
# the top four missing columns are the rating, avg_rating, description, and review columns
# the avg_rating column would only be missing if that recipe did not have any ratings so its missingness is MD
# my theory on the missingnss of the rating column: i think that the website has a comment function and our data includes both comments and actual ratings scraped from website
# as such i think the missingness of the rating is NMAR and is a consequence of the data collection method (maybe the web scrapping pulled both comments and reviews or the website has no distinction between them)
# the missingness of the review column could just be that someone decided to leave a rating and not a write a description (NMAR)
# i think the missingness of the description column is quite interesting and odd (maybe could be MAR and depend on the steps column)
# for example, maybe the person writing the recipe decided to not put a description and just put the steps to the recipe
# n_steps and description (recipe that is easier has no description)
Out[19]:
Missing Count
rating 15036
avg_rating 2777
description 114
... ...
protein (g) 1
saturated fat (g) 1
carbohydrates (g) 1

14 rows × 1 columns

In [20]:
# missingness dependency on description column with permutation tests 

recipes_cleaned_df['description_missing'] = recipes_cleaned_df['description'].isna()

def permutation_test(df, col_missing, col_test, n_permutations=1000):
    copied_df = df.copy()
    copied_df = copied_df.dropna(subset=[col_test])

    missing_mask = copied_df[col_missing].values
    test_values = copied_df[col_test].values

    observed_diff = test_values[missing_mask].mean() - test_values[~missing_mask].mean()

    diffs = []
    for _ in range(n_permutations):
        shuffled_mask = np.random.permutation(missing_mask)
        diff = test_values[shuffled_mask].mean() - test_values[~shuffled_mask].mean()
        diffs.append(diff)

    p_value = np.mean(np.abs(diffs) >= np.abs(observed_diff))
    return observed_diff, p_value
In [21]:
# missingness dependency: description and n_steps 
diff, p = permutation_test(recipes_cleaned_df, 'description_missing', 'n_steps')
print(f'n_steps: Observed diff: {diff:.4f}, p-value: {p}')
n_steps: Observed diff: 0.7194, p-value: 0.243
In [22]:
# missingness dependency: description and n_ingredients
diff, p = permutation_test(recipes_cleaned_df, 'description_missing', 'n_ingredients')
print(f'n_ingredients: Observed diff: {diff:.4f}, p-value: {p}')
n_ingredients: Observed diff: -1.1335, p-value: 0.004
In [23]:
# missingness dependency: description and calories
diff, p = permutation_test(recipes_cleaned_df, 'description_missing', 'calories (#)')
print(f'calories: Observed diff: {diff:.4f}, p-value: {p}')
calories: Observed diff: -101.9820, p-value: 0.046
In [24]:
# missingness dependency: description and average rating 
diff, p = permutation_test(recipes_cleaned_df, 'description_missing', 'avg_rating')
print(f'average rating: Observed diff: {diff:.4f}, p-value: {p}')
average rating: Observed diff: -0.1903, p-value: 0.001
In [25]:
# missingness dependency: description and protein
diff, p = permutation_test(recipes_cleaned_df, 'description_missing', 'protein (g)')
print(f'protein: Observed diff: {diff:.4f}, p-value: {p}')
protein: Observed diff: 2.0542, p-value: 0.276

n_steps: No significant difference in number of steps between recipes with vs without a description. n_ingredients: Yes, statistically significant difference. Recipes without descriptions tend to use fewer ingredients on average. calories: Might be statistically significant but weak significance. Recipes without descriptions might be lower calorie, but this is not statistically significant. avg_rating: Yes, statistically significant. Recipes without descriptions tend to have lower average ratings. protein: No statistical significance in protein content based on description missingness.

Step 4: Hypothesis Testing¶

In [26]:
def tag_based_permutation_test(df, tag_a, tag_b, score_col, n_permutations=1000):
    df_exploded = df.explode('tags')
    group_a = df_exploded[df_exploded['tags'] == tag_a][score_col].dropna()
    group_b = df_exploded[df_exploded['tags'] == tag_b][score_col].dropna()

    if len(group_a) == 0 or len(group_b) == 0:
        return np.nan, np.nan

    observed_diff = group_a.mean() - group_b.mean()
    combined = np.concatenate([group_a.values, group_b.values])
    diffs = []

    for _ in range(n_permutations):
        np.random.shuffle(combined)
        new_a = combined[:len(group_a)]
        new_b = combined[len(group_a):]
        diffs.append(new_a.mean() - new_b.mean())

    p_value = np.mean(np.abs(diffs) >= np.abs(observed_diff))
    return observed_diff, p_value
In [27]:
# What types of recipes tend to have the most calories? (based on tags)
# Dessert vs main dish

# Null hypothesis: Calories are around the same for desserts and main dishes.
# Alternative hypothesis: Calories for main dishes are less than calories for desserts.

diff, p = tag_based_permutation_test(recipes_cleaned_df, 'desserts', 'main-dish', 'calories (#)')
print(f"Observed diff: {diff}, p-value: {p}")
Observed diff: 3.4238541343218003, p-value: 0.443
In [28]:
# What types of recipes tend to have higher average ratings? (based on tags)
# Dessert vs main dish 

# Null hypothesis: Average ratings are around the same for desserts and main dishes.
# Alternative hypothesis: Main dishes are on average rated higher than desserts.

diff, p = tag_based_permutation_test(recipes_cleaned_df, 'desserts', 'main-dish', 'avg_rating')
print(f"Observed diff: {diff}, p-value: {p:8f}")
Observed diff: -0.043594616861519775, p-value: 0.000000
In [29]:
def permutation_test_diff(df, group_col, group_a, group_b, value_col, n_permutations=1000):
    group1 = df[df[group_col] == group_a][value_col].dropna()
    group2 = df[df[group_col] == group_b][value_col].dropna()

    if len(group1) == 0 or len(group2) == 0:
        return np.nan, np.nan

    observed_diff = group1.mean() - group2.mean()
    combined = np.concatenate([group1.values, group2.values])
    diffs = []
    for _ in range(n_permutations):
        np.random.shuffle(combined)
        new1 = combined[:len(group1)]
        new2 = combined[len(group1):]
        diffs.append(new1.mean() - new2.mean())

    p_value = np.mean(np.abs(diffs) >= np.abs(observed_diff))
    return observed_diff, p_value
In [30]:
# What types of recipes tend to be healthier (i.e. more protein, fewer carbs)?
# Are healthier recipes more highly rated recipes?

# Null hypothesis: Healthier recipes are on average rated the same as less healthier recipes.
# Alternative hypothesis: Healthier recipes are on average rated higher than less healthier recipes.

recipes_cleaned_df['health_score'] = recipes_cleaned_df['protein (g)'] - recipes_cleaned_df['carbohydrates (g)']
median_health = recipes_cleaned_df['health_score'].median()
recipes_cleaned_df['high_health'] = recipes_cleaned_df['health_score'] >= median_health

observed_diff, p = permutation_test_diff(recipes_cleaned_df, 'high_health', True, False, 'avg_rating')
print(f"Observed diff: {observed_diff}, p-value: {p:8f}")
Observed diff: 0.02099521814306815, p-value: 0.000000
In [31]:
# What is the relationship between the cooking time and average rating of recipes?

# Null hypothesis: Recipes with longer cook times are on average rated the same as recipes with longer cook times.
# Alternative hypothesis: Recipes with shorter cook times are on average rated better than recipes with longer cook times.

median_time = recipes_cleaned_df['minutes'].median()
recipes_cleaned_df['is_quick'] = recipes_cleaned_df['minutes'] < median_time

observed_diff, p = permutation_test_diff(recipes_cleaned_df, 'is_quick', True, False, 'avg_rating')
print(f"Observed diff: {observed_diff:.4f}, p-value: {p:.8f}")
Observed diff: 0.0342, p-value: 0.00000000
In [32]:
list(recipes_cleaned_df.explode('tags').groupby('tags').count().index)
Out[32]:
['',
 '1-day-or-more',
 '15-minutes-or-less',
 '3-steps-or-less',
 '30-minutes-or-less',
 '4-hours-or-less',
 '5-ingredients-or-less',
 '60-minutes-or-less',
 'Throw the ultimate fiesta with this sopaipillas recipe from Food.com.',
 'a1-sauce',
 'african',
 'american',
 'amish-mennonite',
 'angolan',
 'appetizers',
 'apples',
 'april-fools-day',
 'argentine',
 'artichoke',
 'asian',
 'asparagus',
 'australian',
 'austrian',
 'avocado',
 'bacon',
 'baja',
 'baked-beans',
 'baking',
 'bananas',
 'bar-cookies',
 'barbecue',
 'bass',
 'bean-soup',
 'beans',
 'beans-side-dishes',
 'bear',
 'beef',
 'beef-barley-soup',
 'beef-crock-pot',
 'beef-kidney',
 'beef-liver',
 'beef-organ-meats',
 'beef-ribs',
 'beef-sauces',
 'beef-sausage',
 'beginner-cook',
 'beijing',
 'belgian',
 'berries',
 'beverages',
 'birthday',
 'biscotti',
 'bisques-cream-soups',
 'black-bean-soup',
 'black-beans',
 'blueberries',
 'bok-choys',
 'brazilian',
 'bread-machine',
 'bread-pudding',
 'breads',
 'breakfast',
 'breakfast-casseroles',
 'breakfast-eggs',
 'breakfast-potatoes',
 'brewing',
 'british-columbian',
 'broccoli',
 'broil',
 'brown-bag',
 'brown-rice',
 'brownies',
 'brunch',
 'burgers',
 'cabbage',
 'cajun',
 'cake-fillings-and-frostings',
 'cakes',
 'californian',
 'cambodian',
 'camping',
 'canadian',
 'candy',
 'canning',
 'cantonese',
 'caribbean',
 'carrots',
 'casseroles',
 'catfish',
 'cauliflower',
 'celebrity',
 'central-american',
 'chard',
 'cheese',
 'cheesecake',
 'cherries',
 'chick-peas-garbanzos',
 'chicken',
 'chicken-breasts',
 'chicken-crock-pot',
 'chicken-livers',
 'chicken-stew',
 'chicken-stews',
 'chicken-thighs-legs',
 'chilean',
 'chili',
 'chinese',
 'chinese-new-year',
 'chocolate',
 'chocolate-chip-cookies',
 'chowders',
 'christmas',
 'chutneys',
 'cinco-de-mayo',
 'citrus',
 'clams',
 'clear-soups',
 'cobblers-and-crisps',
 'cocktails',
 'coconut',
 'cod',
 'coffee-cakes',
 'collard-greens',
 'college',
 'colombian',
 'comfort-food',
 'condiments-etc',
 'congolese',
 'cookies-and-brownies',
 'cooking-mixes',
 'copycat',
 'corn',
 'costa-rican',
 'course',
 'crab',
 'cranberry-sauce',
 'crawfish',
 'creole',
 'crock-pot-main-dish',
 'crock-pot-slow-cooker',
 'crusts-pastry-dough-2',
 'cuban',
 'cuisine',
 'cupcakes',
 'curries',
 'czech',
 'dairy-free',
 'danish',
 'deep-fry',
 'deer',
 'dehydrator',
 'desserts',
 'desserts-easy',
 'desserts-fruit',
 'diabetic',
 'dietary',
 'dinner-party',
 'dips',
 'dips-lunch-snacks',
 'drop-cookies',
 'duck',
 'duck-breasts',
 'dutch',
 'easter',
 'easy',
 'ecuadorean',
 'egg-free',
 'eggplant',
 'eggs',
 'eggs-breakfast',
 'eggs-dairy',
 'egyptian',
 'elbow-macaroni',
 'elk',
 'english',
 'equipment',
 'ethiopian',
 'european',
 'fall',
 'fathers-day',
 'filipino',
 'fillings-and-frostings-chocolate',
 'finger-food',
 'finnish',
 'fish',
 'flat-shapes',
 'food-processor-blender',
 'for-1-or-2',
 'for-large-groups',
 'for-large-groups-holiday-event',
 'free-of-something',
 'freezer',
 'french',
 'freshwater-fish',
 'from-scratch',
 'frozen-desserts',
 'fruit',
 'fudge',
 'garnishes',
 'gelatin',
 'georgian',
 'german',
 'gifts',
 'gluten-free',
 'goose',
 'grains',
 'granola-and-porridge',
 'grapes',
 'greek',
 'green-yellow-beans',
 'greens',
 'grilling',
 'ground-beef',
 'guatemalan',
 'gumbo',
 'halibut',
 'halloween',
 'halloween-cakes',
 'halloween-cocktails',
 'halloween-cupcakes',
 'ham',
 'ham-and-bean-soup',
 'hand-formed-cookies',
 'hanukkah',
 'hawaiian',
 'healthy',
 'healthy-2',
 'heirloom-historical',
 'heirloom-historical-recipes',
 'herb-and-spice-mixes',
 'hidden-valley-ranch',
 'high-calcium',
 'high-fiber',
 'high-in-something',
 'high-in-something-diabetic-friendly',
 'high-protein',
 'holiday-event',
 'honduran',
 'hunan',
 'hungarian',
 'ice-cream',
 'icelandic',
 'independence-day',
 'indian',
 'indonesian',
 'inexpensive',
 'infant-baby-friendly',
 'iranian-persian',
 'iraqi',
 'irish',
 'irish-st-patricks-day',
 'italian',
 'jams-and-preserves',
 'japanese',
 'jellies',
 'jewish-ashkenazi',
 'jewish-sephardi',
 'kid-friendly',
 'kiwifruit',
 'korean',
 'kosher',
 'kwanzaa',
 'labor-day',
 'lactose',
 'lamb-sheep',
 'lamb-sheep-main-dish',
 'laotian',
 'lasagna',
 'lasagne',
 'lebanese',
 'leftovers',
 'lemon',
 'lentils',
 'less_thansql:name_topics_of_recipegreater_than',
 'lettuces',
 'libyan',
 'lime',
 'lobster',
 'long-grain-rice',
 'low-calorie',
 'low-carb',
 'low-cholesterol',
 'low-fat',
 'low-in-something',
 'low-protein',
 'low-saturated-fat',
 'low-sodium',
 'lunch',
 'macaroni-and-cheese',
 'mahi-mahi',
 'main-dish',
 'main-dish-beef',
 'main-dish-chicken',
 'main-dish-pasta',
 'main-dish-pork',
 'main-dish-seafood',
 'main-ingredient',
 'malaysian',
 'mango',
 'manicotti',
 'mardi-gras-carnival',
 'marinades-and-rubs',
 'marinara-sauce',
 'mashed-potatoes',
 'meat',
 'meatballs',
 'meatloaf',
 'medium-grain-rice',
 'melons',
 'memorial-day',
 'mexican',
 'micro-melanesia',
 'microwave',
 'middle-eastern',
 'middle-eastern-main-dish',
 'midwestern',
 'mixer',
 'mongolian',
 'moose',
 'moroccan',
 'mothers-day',
 'muffins',
 'mushroom-soup',
 'mushrooms',
 'mussels',
 'namibian',
 'native-american',
 'nepalese',
 'new-years',
 'new-zealand',
 'nigerian',
 'no-cook',
 'no-shell-fish',
 'non-alcoholic',
 'north-american',
 'northeastern-united-states',
 'norwegian',
 'novelty',
 'number-of-servings',
 'nut-free',
 'nuts',
 'oamc-freezer-make-ahead',
 'oatmeal',
 'oaxacan',
 'occasion',
 'octopus',
 'omelets-and-frittatas',
 'one-dish-meal',
 'onions',
 'ontario',
 'orange-roughy',
 'oranges',
 'oven',
 'oysters',
 'pacific-northwest',
 'pakistani',
 'palestinian',
 'pancakes-and-waffles',
 'papaya',
 'passover',
 'pasta',
 'pasta-elbow-macaroni',
 'pasta-rice-and-grains',
 'pasta-salad',
 'pasta-shells',
 'peaches',
 'peanut-butter',
 'pears',
 'penne',
 'pennsylvania-dutch',
 'peppers',
 'perch',
 'peruvian',
 'pheasant',
 'pickeral',
 'picnic',
 'pies',
 'pies-and-tarts',
 'pineapple',
 'pitted-fruit',
 'pizza',
 'plums',
 'polish',
 'polynesian',
 'pork',
 'pork-chops',
 'pork-crock-pot',
 'pork-loin',
 'pork-loins',
 'pork-loins-roast',
 'pork-ribs',
 'pork-sausage',
 'portuguese',
 'pot-pie',
 'pot-roast',
 'potatoes',
 'potluck',
 'poultry',
 'preparation',
 'prepared-potatoes',
 'presentation',
 'pressure-canning',
 'pressure-cooker',
 'puddings-and-mousses',
 'puerto-rican',
 'pumpkin',
 'pumpkin-bread',
 'punch',
 'quebec',
 'quiche',
 'quick-breads',
 'rabbit',
 'ragu-recipe-contest',
 'ramadan',
 'raspberries',
 'ravioli-tortellini',
 'refrigerator',
 'reynolds-wrap',
 'rice',
 'roast',
 'roast-beef',
 'roast-beef-comfort-food',
 'roast-beef-main-dish',
 'rolled-cookies',
 'rolls-biscuits',
 'romantic',
 'rosh-hashana',
 'rosh-hashanah',
 'russian',
 'salad-dressings',
 'salads',
 'salmon',
 'salsas',
 'saltwater-fish',
 'sandwiches',
 'sauces',
 'saudi-arabian',
 'savory',
 'savory-pies',
 'savory-sauces',
 'scallops',
 'scandinavian',
 'scones',
 'scottish',
 'seafood',
 'seasonal',
 'served-cold',
 'served-hot',
 'served-hot-new-years',
 'shakes',
 'shellfish',
 'short-grain-rice',
 'shrimp',
 'shrimp-main-dish',
 'side-dishes',
 'side-dishes-beans',
 'simply-potatoes',
 'simply-potatoes2',
 'small-appliance',
 'smoker',
 'smoothies',
 'snacks',
 'snacks-kid-friendly',
 'snacks-sweet',
 'sole-and-flounder',
 'somalian',
 'soul',
 'soups-stews',
 'sourdough',
 'south-african',
 'south-american',
 'south-west-pacific',
 'southern-united-states',
 'southwestern-united-states',
 'soy-tofu',
 'spaghetti',
 'spaghetti-sauce',
 'spanish',
 'spicy',
 'spinach',
 'spreads',
 'spring',
 'squash',
 'squid',
 'st-patricks-day',
 'steak',
 'steaks',
 'steam',
 'stews',
 'stews-poultry',
 'stir-fry',
 'stocks',
 'stove-top',
 'strawberries',
 'stuffings-dressings',
 'sudanese',
 'sugar-cookies',
 'summer',
 'super-bowl',
 'superbowl',
 'swedish',
 'sweet',
 'sweet-sauces',
 'swiss',
 'szechuan',
 'tarts',
 'taste-mood',
 'technique',
 'tempeh',
 'tex-mex',
 'thai',
 'thanksgiving',
 'tilapia',
 'time-to-make',
 'to-go',
 'toddler-friendly',
 'tomatoes',
 'tropical-fruit',
 'trout',
 'tuna',
 'turkey',
 'turkey-breasts',
 'turkey-burgers',
 'turkish',
 'unprocessed-freezer',
 'valentines-day',
 'veal',
 'vegan',
 'vegetables',
 'vegetarian',
 'veggie-burgers',
 'venezuelan',
 'very-low-carbs',
 'vietnamese',
 'water-bath',
 'wedding',
 'weeknight',
 'welsh',
 'white-rice',
 'whitefish',
 'whole-chicken',
 'whole-duck',
 'whole-turkey',
 'wild-game',
 'wings',
 'winter',
 'yams-sweet-potatoes',
 'yeast',
 'zucchini']
In [33]:
# Step 4: Hypothesis Testing — Do baked goods have significantly more sugar than non-baked goods?

baked = filtered_df[filtered_df['is_baked_good'] == True]['sugar (g)']
non_baked = filtered_df[filtered_df['is_baked_good'] == False]['sugar (g)']

observed_stat = baked.mean() - non_baked.mean()

# Setup for permutation test
sugar = filtered_df['sugar (g)'].values
labels = filtered_df['is_baked_good'].values

n_reps = 5000
perm_stats = []

for _ in range(n_reps):
    shuffled_labels = np.random.permutation(labels)
    group1 = sugar[shuffled_labels == True]
    group2 = sugar[shuffled_labels == False]
    stat = group1.mean() - group2.mean()
    perm_stats.append(stat)

perm_stats = np.array(perm_stats)
p_value = np.mean(perm_stats >= observed_stat)

print(f"Observed Statistic: {observed_stat:.4f}")
print(f"p-value: {p_value:.4f}")
Observed Statistic: 21.4139
p-value: 0.0000
In [34]:
# Step 4: Hypothesis Testing — Do non-baked goods have significantly more protein than baked goods?

baked = filtered_df[filtered_df['is_baked_good'] == True]['protein (g)']
non_baked = filtered_df[filtered_df['is_baked_good'] == False]['protein (g)']

observed_stat = non_baked.mean() - baked.mean()

# Setup for permutation test
protein = filtered_df['protein (g)'].values
labels = filtered_df['is_baked_good'].values

n_reps = 5000
perm_stats = []

for _ in range(n_reps):
    shuffled_labels = np.random.permutation(labels)
    group1 = protein[shuffled_labels == True]
    group2 = protein[shuffled_labels == False]
    stat = group2.mean() - group1.mean()
    perm_stats.append(stat)

perm_stats = np.array(perm_stats)
p_value = np.mean(perm_stats >= observed_stat)

print(f"Observed Statistic: {observed_stat:.4f}")
print(f"p-value: {p_value:.4f}")
Observed Statistic: 8.9204
p-value: 0.0000

Step 5: Framing a Prediction Problem¶

In [35]:
# We will be doing a classification problem on predicting the is_baked_good column.
# It is binary classification considering that we only have True or False outputs.
In [36]:
filtered_df.columns
Out[36]:
Index(['name', 'id', 'minutes', 'contributor_id', 'tags', 'n_steps', 'steps',
       'description', 'ingredients', 'n_ingredients', 'user_id', 'recipe_id',
       'rating', 'review', 'avg_rating', 'calories (#)', 'total fat (PDV)',
       'sugar (PDV)', 'sodium (PDV)', 'protein (PDV)', 'saturated fat (PDV)',
       'carbohydrates (PDV)', 'date_review_submitted', 'date_recipe_posted',
       'total fat (g)', 'sugar (g)', 'sodium (mg)', 'protein (g)',
       'saturated fat (g)', 'carbohydrates (g)', 'year', 'is_baked_good'],
      dtype='object')

Step 6: Baseline Model¶

In [37]:
# We will use the sugar and protein columns in the baseline model. 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Features and label
X = filtered_df[['sugar (g)', 'protein (g)']].copy()
y = filtered_df['is_baked_good']

# Full pipeline with preprocessing and logistic regression
pipeline = Pipeline(steps=[
    ('classifier', LogisticRegression())
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

       False       0.76      0.96      0.84     40566
        True       0.66      0.22      0.33     16024

    accuracy                           0.75     56590
   macro avg       0.71      0.59      0.59     56590
weighted avg       0.73      0.75      0.70     56590

Step 7: Final Model¶

In [38]:
# adding another column
# We will use the sugar and protein AND SODIUM columns in the baseline model. 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Features and label
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']].copy()
y = filtered_df['is_baked_good']

# Preprocessing for numeric features
numeric_features = ['sugar (g)', 'protein (g)', 'n_steps']
numeric_transformer = StandardScaler()

# ColumnTransformer for numeric preprocessing
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

# Full pipeline with preprocessing and logistic regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

       False       0.78      0.95      0.85     40566
        True       0.70      0.31      0.43     16024

    accuracy                           0.77     56590
   macro avg       0.74      0.63      0.64     56590
weighted avg       0.76      0.77      0.73     56590

In [39]:
# adding another column
# We will use the sugar and protein AND SODIUM columns in the baseline model. 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Features and label
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']].copy()
y = filtered_df['is_baked_good']

# Preprocessing for numeric features
numeric_features = ['sugar (g)', 'protein (g)', 'n_steps']
numeric_transformer = StandardScaler()

# ColumnTransformer for numeric preprocessing
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

# Full pipeline with preprocessing and logistic regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(class_weight='balanced'))
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

       False       0.86      0.74      0.80     40566
        True       0.52      0.70      0.60     16024

    accuracy                           0.73     56590
   macro avg       0.69      0.72      0.70     56590
weighted avg       0.77      0.73      0.74     56590

In [40]:
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

# Features and label
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']].copy().to_numpy()
y = filtered_df['is_baked_good'].to_numpy()

# Preprocessing and model
numeric_features = [0, 1, 2]  # since we're using a NumPy array now
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features)
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(class_weight='balanced'))
])

# K-Fold setup
kf = KFold(n_splits=5, shuffle=True)

# Storage for metrics
accuracies, precisions, recalls, f1s = [], [], [], []

# Run k-fold CV
for train_index, val_index in kf.split(X):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred))
    recalls.append(recall_score(y_val, y_pred))
    f1s.append(f1_score(y_val, y_pred))

# Report mean results
print(f"Accuracy: {np.mean(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f}")
print(f"Recall: {np.mean(recalls):.4f}")
print(f"F1 Score: {np.mean(f1s):.4f}")
Accuracy: 0.7297
Precision: 0.5166
Recall: 0.7065
F1 Score: 0.5968
In [41]:
# decision trees
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Features and label
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']].copy()
y = filtered_df['is_baked_good']

# Preprocessing
numeric_features = ['sugar (g)', 'protein (g)', 'n_steps']
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features)
])

# Pipeline with Decision Tree
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier())
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y
)

# Train and evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

       False       0.92      0.95      0.94     40566
        True       0.86      0.80      0.83     16024

    accuracy                           0.91     56590
   macro avg       0.89      0.87      0.88     56590
weighted avg       0.90      0.91      0.90     56590

In [42]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

# Use X and y as pandas DataFrame/Series — not NumPy arrays
kf = KFold(n_splits=5, shuffle=True, random_state=42)

accuracies, precisions, recalls, f1s = [], [], [], []

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))

# Display average results
print(f"Average Accuracy:  {np.mean(accuracies):.4f}")
print(f"Average Precision: {np.mean(precisions):.4f}")
print(f"Average Recall:    {np.mean(recalls):.4f}")
print(f"Average F1 Score:  {np.mean(f1s):.4f}")
Average Accuracy:  0.9082
Average Precision: 0.8587
Average Recall:    0.8088
Average F1 Score:  0.8330
In [43]:
# decision trees
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Features and label
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']].copy()
y = filtered_df['is_baked_good']

# Preprocessing
numeric_features = ['sugar (g)', 'protein (g)', 'n_steps']
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features)
])

# Pipeline with Decision Tree
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(max_depth=10, class_weight='balanced'))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y
)

# Train and evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

       False       0.88      0.82      0.85     40566
        True       0.61      0.73      0.66     16024

    accuracy                           0.79     56590
   macro avg       0.75      0.77      0.76     56590
weighted avg       0.81      0.79      0.80     56590

In [44]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

# Use X and y as pandas DataFrame/Series — not NumPy arrays
kf = KFold(n_splits=5, shuffle=True)

accuracies, precisions, recalls, f1s = [], [], [], []

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))

# Display average results
print(f"Average Accuracy:  {np.mean(accuracies):.4f}")
print(f"Average Precision: {np.mean(precisions):.4f}")
print(f"Average Recall:    {np.mean(recalls):.4f}")
print(f"Average F1 Score:  {np.mean(f1s):.4f}")
Average Accuracy:  0.7919
Average Precision: 0.6100
Average Recall:    0.7355
Average F1 Score:  0.6669
In [45]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Features and target
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']]
y = filtered_df['is_baked_good']

# Preprocessing
numeric_features = ['sugar (g)', 'protein (g)', 'n_steps']
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features)
])

# Pipeline setup
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier())
])

# Parameter grid to search
param_grid = {
    'classifier__max_depth': [4, 6, 8, 10],
    'classifier__min_samples_leaf': [5, 10, 20],
    'classifier__min_samples_split': [10, 20, 40],
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__class_weight': ['balanced']
}

# Grid search with 5-fold cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='f1',
    verbose=1
)

# Fit grid search
grid_search.fit(X, y)

# Output best results
print("Best parameters:")
print(grid_search.best_params_)
print("\nBest F1 score:")
print(grid_search.best_score_)
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters:
{'classifier__class_weight': 'balanced', 'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 20, 'classifier__min_samples_split': 10}

Best F1 score:
0.6075506522398966
In [46]:
# random forests 
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Features and target
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']]
y = filtered_df['is_baked_good']

# Preprocessing
numeric_features = ['sugar (g)', 'protein (g)', 'n_steps']
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features)
])

# Random Forest pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        bootstrap=True,
        class_weight='balanced'))
])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y
)

# Train and evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

       False       0.89      0.85      0.87     40566
        True       0.65      0.72      0.69     16024

    accuracy                           0.81     56590
   macro avg       0.77      0.79      0.78     56590
weighted avg       0.82      0.81      0.82     56590

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Features and target
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']]
y = filtered_df['is_baked_good']

# Preprocessing
numeric_features = ['sugar (g)', 'protein (g)', 'n_steps']
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features)
])

# Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(class_weight='balanced'))
])

# Grid of hyperparameters
param_grid = {
    'classifier__n_estimators': [10, 50, 100],
    'classifier__max_depth': [None, 6, 10],
    'classifier__bootstrap': [True, False], 
    'classifier__class_weight': ['balanced']
}

# Grid search with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    n_jobs=2,
    verbose=2
)

# Fit the grid search
grid_search.fit(X, y)

# Output best results
print("✅ Best parameters:")
print(grid_search.best_params_)
print("\n📈 Best F1 score:")
print(grid_search.best_score_)
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   0.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   0.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   0.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=   3.2s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=   3.2s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=   3.2s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=   6.3s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=   6.3s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   0.3s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   0.3s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   0.3s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   1.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=   6.4s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   1.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   1.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=   2.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=   2.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   0.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   0.4s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   0.4s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=   2.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   2.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   2.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   2.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=   4.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=   4.0s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   1.0s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   0.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=   4.0s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   0.9s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=   4.3s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=   4.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=   4.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=   8.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=   8.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   0.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   0.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   0.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   1.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   1.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=   8.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   1.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=   3.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=   3.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   0.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   0.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   0.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=   3.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   2.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   2.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   2.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=   5.2s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=   5.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=   5.2s
✅ Best parameters:
{'classifier__bootstrap': True, 'classifier__class_weight': 'balanced', 'classifier__max_depth': 10, 'classifier__n_estimators': 100}

📈 Best F1 score:
0.6226984077766446
In [48]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Use X and y as pandas DataFrame/Series — not NumPy arrays
kf = KFold(n_splits=5, shuffle=True)

accuracies, precisions, recalls, f1s = [], [], [], []

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))

# Display average results
print(f"Average Accuracy:  {np.mean(accuracies):.4f}")
print(f"Average Precision: {np.mean(precisions):.4f}")
print(f"Average Recall:    {np.mean(recalls):.4f}")
print(f"Average F1 Score:  {np.mean(f1s):.4f}")
Average Accuracy:  0.9067
Average Precision: 0.8195
Average Recall:    0.8599
Average F1 Score:  0.8392
In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# --- Feature engineering functions ---

# Calories per ingredient
def add_calories_per_ingredient(X):
    calories = X[:, 0]
    n_ingredients = X[:, 1]
    return (calories / n_ingredients).reshape(-1, 1)

# Description word count
def add_description_word_count(X):
    return X.iloc[:, 0].apply(lambda desc: len(str(desc).split()) if isinstance(desc, str) else 0).to_numpy().reshape(-1, 1)

# --- Define features and target ---

X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps', 'calories (#)', 'n_ingredients', 'description']]
y = filtered_df['is_baked_good']

# --- Unified ColumnTransformer with all transformations ---
preprocessor = ColumnTransformer(transformers=[
    ('scaled_numeric', StandardScaler(), ['sugar (g)', 'protein (g)', 'n_steps']),
    ('cal_per_ing', FunctionTransformer(add_calories_per_ingredient, validate=True), ['calories (#)', 'n_ingredients']),
    ('desc_len', FunctionTransformer(add_description_word_count, validate=False), ['description'])
])

# --- Final pipeline ---
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        bootstrap=True,
        class_weight='balanced'
    ))
])

# --- Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# --- Fit and evaluate ---
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

       False       0.89      0.86      0.88     40566
        True       0.67      0.74      0.71     16024

    accuracy                           0.82     56590
   macro avg       0.78      0.80      0.79     56590
weighted avg       0.83      0.82      0.83     56590

In [50]:
# Grid of hyperparameters
param_grid = {
    'classifier__n_estimators': [10, 50, 100],
    'classifier__max_depth': [None, 6, 10],
    'classifier__bootstrap': [True, False], 
    'classifier__class_weight': ['balanced']
}

# Grid search with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=3,
    verbose=2
)

# Fit the grid search
grid_search.fit(X, y)

# Output best results
print("✅ Best parameters:")
print(grid_search.best_params_)
print("\n📈 Best F1 score:")
print(grid_search.best_score_)
Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   2.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   2.2s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   2.2s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   2.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   2.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=   9.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=   9.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=   9.3s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=   9.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=   9.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=  17.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=  17.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=  17.8s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   1.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   1.3s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   1.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   1.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   1.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=  17.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   4.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   4.2s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   4.2s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=  18.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   4.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   4.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=   7.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=   7.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=   7.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   1.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   1.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   1.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=   7.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   1.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=   7.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   1.6s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   5.8s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   5.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   5.8s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   5.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   6.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=  11.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=  11.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=  11.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   3.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   3.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=  11.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   3.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   3.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=  11.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time=   3.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=  13.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=  13.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=  13.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=  13.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time=  13.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=  26.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=  27.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=  26.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   1.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   1.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   1.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   1.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time=   1.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   5.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=  26.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   5.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   5.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   5.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time=  27.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time=   5.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=  11.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=  11.3s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=  11.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   2.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   2.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   2.0s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   2.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=  11.0s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time=   2.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time=  11.0s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   8.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   8.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   8.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   8.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time=   8.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=  16.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=  16.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=  16.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=  15.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time=  14.4s
✅ Best parameters:
{'classifier__bootstrap': True, 'classifier__class_weight': 'balanced', 'classifier__max_depth': 10, 'classifier__n_estimators': 100}

📈 Best F1 score:
0.6357663756207149
In [51]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Use X and y as pandas DataFrame/Series — not NumPy arrays
kf = KFold(n_splits=5, shuffle=True)

accuracies, precisions, recalls, f1s = [], [], [], []

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))

# Display average results
print(f"Average Accuracy:  {np.mean(accuracies):.4f}")
print(f"Average Precision: {np.mean(precisions):.4f}")
print(f"Average Recall:    {np.mean(recalls):.4f}")
print(f"Average F1 Score:  {np.mean(f1s):.4f}")
Average Accuracy:  0.8243
Average Precision: 0.6719
Average Recall:    0.7418
Average F1 Score:  0.7051

Step 8: Fairness Analysis¶

In [52]:
# TODO